{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import data_loader\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "import os\n",
    "import nltk\n",
    "import re\n",
    "import timeit\n",
    "\n",
    "from torch.autograd import Variable\n",
    "import torch\n",
    "\n",
    "from sklearn import preprocessing, svm, metrics\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold\n",
    "from sklearn.externals import joblib\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import PorterStemmer\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "from util.classification.lstm_pos_tagger import LSTMPOSTagger\n",
    "\n",
    "# Request the train, dev and test splits from the project data loader.\n",
    "meta_list, data_list = data_loader.load_data(\n",
    "    load_train=True,\n",
    "    load_dev=True,\n",
    "    load_test=True,\n",
    ")\n",
    "\n",
    "# Each returned list unpacks into six items: an original and a *_corrected\n",
    "# entry for each of train, dev and test, in that order.\n",
    "(\n",
    "    train_meta, train_meta_corrected,\n",
    "    dev_meta, dev_meta_corrected,\n",
    "    test_meta, test_meta_corrected,\n",
    ") = meta_list\n",
    "\n",
    "(\n",
    "    train_data, train_data_corrected,\n",
    "    dev_data, dev_data_corrected,\n",
    "    test_data, test_data_corrected,\n",
    ") = data_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# of Sentence: 4124\n",
      "Sentence distribution:\n",
      "            # of sentences\n",
      "Russian                427\n",
      "French                 401\n",
      "Spanish                428\n",
      "Japanese               407\n",
      "Chinese                414\n",
      "Turkish                404\n",
      "Portuguese             407\n",
      "Korean                 413\n",
      "German                 400\n",
      "Italian                423\n",
      "Author distribution:\n",
      "            # of authors\n",
      "Russian               81\n",
      "French               131\n",
      "Spanish              175\n",
      "Japanese              81\n",
      "Chinese               66\n",
      "Turkish               73\n",
      "Portuguese            68\n",
      "Korean                84\n",
      "German                69\n",
      "Italian               76\n",
      "Exam score stats:\n",
      "            count       mean       std   max   min\n",
      "Russian     427.0  26.288056  6.179166  40.0   9.0\n",
      "French      401.0  27.630923  4.666738  40.0  17.0\n",
      "Spanish     428.0  26.789720  5.349402  40.0  11.0\n",
      "Japanese    407.0  27.547912  5.040432  39.0  15.0\n",
      "Chinese     414.0  26.268116  6.210832  40.0  14.0\n",
      "Turkish     404.0  27.834158  5.494389  39.0   7.0\n",
      "Portuguese  407.0  27.791155  4.963723  39.0  11.0\n",
      "Korean      413.0  25.980630  6.019355  40.0  12.0\n",
      "German      400.0  27.725000  5.880546  40.0  13.0\n",
      "Italian     423.0  28.699764  4.388392  38.0  20.0\n"
     ]
    }
   ],
   "source": [
    "# Per-native-language summary of the training metadata: sentence counts,\n",
    "# distinct doc_id counts, and exam score statistics.\n",
    "languages = train_meta[\"native_language\"].unique()\n",
    "\n",
    "\n",
    "def rows_for(language):\n",
    "    \"\"\"Return the rows of train_meta whose native_language equals `language`.\"\"\"\n",
    "    return train_meta[train_meta[\"native_language\"] == language]\n",
    "\n",
    "\n",
    "print(\"# of Sentence: {}\".format(len(train_meta)))\n",
    "\n",
    "# One row of train_meta is counted as one sentence.\n",
    "print(\"Sentence distribution:\")\n",
    "stats = [len(rows_for(language)) for language in languages]\n",
    "stats_df = pd.DataFrame(stats, columns=[\"# of sentences\"], index=languages)\n",
    "print(stats_df)\n",
    "\n",
    "# Authors are counted as distinct doc_id values within each language.\n",
    "print(\"Author distribution:\")\n",
    "stats = [len(rows_for(language)[\"doc_id\"].unique()) for language in languages]\n",
    "stats_df = pd.DataFrame(stats, columns=[\"# of authors\"], index=languages)\n",
    "print(stats_df)\n",
    "\n",
    "# Keep only the selected describe() fields, in this column order.\n",
    "print(\"Exam score stats:\")\n",
    "score_fields = ['count', 'mean', 'std', 'max', 'min']\n",
    "stats = [rows_for(language)[\"score\"].describe()[score_fields] for language in languages]\n",
    "stats_df = pd.DataFrame(stats, index=languages)\n",
    "print(stats_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
